Environment Initialization

library(fivethirtyeight) # new to you!
library(moderndive)
library(skimr)
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 3.0.0     ✔ purrr   0.2.5
## ✔ tibble  1.4.2     ✔ dplyr   0.7.6
## ✔ tidyr   0.8.1     ✔ stringr 1.3.1
## ✔ readr   1.1.1     ✔ forcats 0.3.0
## ── Conflicts ────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
library(GGally) # new to you!
## 
## Attaching package: 'GGally'
## The following object is masked from 'package:dplyr':
## 
##     nasa
library(corrr)

Data

The data we are interest is a subset of hate_crimes data set. The variables we are interested in have the following meanings:

avg_hatecrimes_per_100k_fbi: Average annual hate crimes per 100,000 population, FBI, 2010-2015

gini_index: Gini Index, 2015

share_pop_hs: Share of adults 25 and older with a high-school degree, 2009

share_vote_trump: Share of 2016 U.S. presidential voters who voted for Donald Trump

EDA Part 1

Raw Data

glimpse(hate_crimes)  
## Observations: 51
## Variables: 12
## $ state                       <chr> "Alabama", "Alaska", "Arizona", "A...
## $ median_house_inc            <int> 42278, 67629, 49254, 44922, 60487,...
## $ share_unemp_seas            <dbl> 0.060, 0.064, 0.063, 0.052, 0.059,...
## $ share_pop_metro             <dbl> 0.64, 0.63, 0.90, 0.69, 0.97, 0.80...
## $ share_pop_hs                <dbl> 0.821, 0.914, 0.842, 0.824, 0.806,...
## $ share_non_citizen           <dbl> 0.02, 0.04, 0.10, 0.04, 0.13, 0.06...
## $ share_white_poverty         <dbl> 0.12, 0.06, 0.09, 0.12, 0.09, 0.07...
## $ gini_index                  <dbl> 0.472, 0.422, 0.455, 0.458, 0.471,...
## $ share_non_white             <dbl> 0.35, 0.42, 0.49, 0.26, 0.61, 0.31...
## $ share_vote_trump            <dbl> 0.63, 0.53, 0.50, 0.60, 0.33, 0.44...
## $ hate_crimes_per_100k_splc   <dbl> 0.12583893, 0.14374012, 0.22531995...
## $ avg_hatecrimes_per_100k_fbi <dbl> 1.8064105, 1.6567001, 3.4139280, 0...

The data is made up of 12 variables, 11 numerical and 1 categorical, the state. There is one row per state.

hate_crimes %>% 
  count(state, sort = TRUE)
## # A tibble: 51 x 2
##    state                    n
##    <chr>                <int>
##  1 Alabama                  1
##  2 Alaska                   1
##  3 Arizona                  1
##  4 Arkansas                 1
##  5 California               1
##  6 Colorado                 1
##  7 Connecticut              1
##  8 Delaware                 1
##  9 District of Columbia     1
## 10 Florida                  1
## # ... with 41 more rows

From the table we can see that the data includes all 50 states and the District of Colombia.

Summary Statistics

crime <- select(hate_crimes, 
                    avg_hatecrimes_per_100k_fbi,
                    share_pop_hs,
                    gini_index,
                    share_vote_trump)
skim(crime)
## Skim summary statistics
##  n obs: 51 
##  n variables: 4 
## 
## ── Variable type:numeric ───────────────────────────────────────────────────────────────
##                     variable missing complete  n mean    sd   p0  p25  p50
##  avg_hatecrimes_per_100k_fbi       1       50 51 2.37 1.71  0.27 1.29 1.99
##                   gini_index       0       51 51 0.45 0.021 0.42 0.44 0.45
##                 share_pop_hs       0       51 51 0.87 0.034 0.8  0.84 0.87
##             share_vote_trump       0       51 51 0.49 0.12  0.04 0.41 0.49
##   p75  p100     hist
##  3.18 10.95 ▇▇▅▁▁▁▁▁
##  0.47  0.53 ▅▅▇▇▁▁▁▁
##  0.9   0.92 ▂▅▅▃▃▅▇▆
##  0.57  0.7  ▁▁▁▃▇▇▆▃

Visualizations:

ggplot(crime, aes(x = avg_hatecrimes_per_100k_fbi)) +
  geom_density() +
  labs(x = "", title = "Hate Crimes")
## Warning: Removed 1 rows containing non-finite values (stat_density).

# Histogram of share_pop_hs (IV):
ggplot(crime, aes(x = share_pop_hs)) +
  geom_density() +
  labs(x = "", title = "HS")

# Histogram of gini (IV):
ggplot(crime, aes(x = gini_index)) +
  geom_density() +
  labs(x = "", title = "Gini")

# Histogram of trump (IV):
ggplot(crime, aes(x = share_vote_trump)) +
  geom_density() +
  labs(x = "", title = "Trump")

hate_demo <- crime %>% 
  mutate(
    cat_trump = case_when(
      share_vote_trump < .5 ~ "less than half", 
      TRUE ~ "more than half"
      )) %>% 
  mutate(cat_trump = as.factor(cat_trump)) %>% 
  select(-share_vote_trump)

EDA Part 2

Now lets look at the way each variable correlates to the variable of interest.

correlate(select(crime, avg_hatecrimes_per_100k_fbi, gini_index))
## 
## Correlation method: 'pearson'
## Missing treated using: 'pairwise.complete.obs'
## # A tibble: 2 x 3
##   rowname                     avg_hatecrimes_per_100k_fbi gini_index
##   <chr>                                             <dbl>      <dbl>
## 1 avg_hatecrimes_per_100k_fbi                      NA          0.421
## 2 gini_index                                        0.421     NA
correlate(select(crime, avg_hatecrimes_per_100k_fbi, share_pop_hs))
## 
## Correlation method: 'pearson'
## Missing treated using: 'pairwise.complete.obs'
## # A tibble: 2 x 3
##   rowname                     avg_hatecrimes_per_100k_fbi share_pop_hs
##   <chr>                                             <dbl>        <dbl>
## 1 avg_hatecrimes_per_100k_fbi                      NA            0.164
## 2 share_pop_hs                                      0.164       NA
correlate(select(crime, avg_hatecrimes_per_100k_fbi, share_vote_trump))
## 
## Correlation method: 'pearson'
## Missing treated using: 'pairwise.complete.obs'
## # A tibble: 2 x 3
##   rowname                     avg_hatecrimes_per_100k_fbi share_vote_trump
##   <chr>                                             <dbl>            <dbl>
## 1 avg_hatecrimes_per_100k_fbi                      NA               -0.505
## 2 share_vote_trump                                 -0.505           NA

There is a 0.4212719 correlation between hate crimes and gini index value. There is a 0.1641476 correlation between hate crimes and high school grad rate. There is a 0.5045316 correlation between hate crimes and trump voter rate.

ggplot(crime, aes(x = share_pop_hs, y = avg_hatecrimes_per_100k_fbi)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE)
## Warning: Removed 1 rows containing non-finite values (stat_smooth).
## Warning: Removed 1 rows containing missing values (geom_point).

ggplot(crime, aes(x = gini_index, y = avg_hatecrimes_per_100k_fbi)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE)
## Warning: Removed 1 rows containing non-finite values (stat_smooth).

## Warning: Removed 1 rows containing missing values (geom_point).

ggplot(crime, aes(x = share_vote_trump, y = avg_hatecrimes_per_100k_fbi)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE)
## Warning: Removed 1 rows containing non-finite values (stat_smooth).

## Warning: Removed 1 rows containing missing values (geom_point).

ggpairs(hate_demo, aes(color = cat_trump))
## Warning: Removed 1 rows containing non-finite values (stat_density).
## Warning in (function (data, mapping, alignPercent = 0.6, method =
## "pearson", : Removing 1 row that contained a missing value
## Warning in (function (data, mapping, alignPercent = 0.6, method =
## "pearson", : Removing 1 row that contained a missing value
## Warning: Removed 1 rows containing non-finite values (stat_boxplot).
## Warning: Removed 1 rows containing missing values (geom_point).

## Warning: Removed 1 rows containing missing values (geom_point).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 1 rows containing non-finite values (stat_bin).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Multiple Regression Model

Now we will

hate_crime_model <- lm( avg_hatecrimes_per_100k_fbi ~ gini_index + share_pop_hs, data=crime)
get_regression_table(hate_crime_model)
## # A tibble: 3 x 7
##   term         estimate std_error statistic p_value lower_ci upper_ci
##   <chr>           <dbl>     <dbl>     <dbl>   <dbl>    <dbl>    <dbl>
## 1 intercept       -54.0      9.76     -5.53       0    -73.7    -34.4
## 2 gini_index       64.3     11.1       5.79       0     42.0     86.7
## 3 share_pop_hs     31.3      6.81      4.59       0     17.6     45.0
summary(hate_crime_model)
## 
## Call:
## lm(formula = avg_hatecrimes_per_100k_fbi ~ gini_index + share_pop_hs, 
##     data = crime)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.7064 -0.9064  0.0390  0.7721  3.4995 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   -54.017      9.763  -5.533 1.36e-06 ***
## gini_index     64.324     11.106   5.792 5.55e-07 ***
## share_pop_hs   31.287      6.813   4.592 3.29e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.319 on 47 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:  0.4322, Adjusted R-squared:  0.408 
## F-statistic: 17.89 on 2 and 47 DF,  p-value: 1.673e-06

The regression table seems to indicate that when you combine the two independent variables into the model you end up with a very steep line. There is an intercept at -54 hate crimes, which is not possible, with a rise of 64 hate crimes for a movement of one in the positive gini index axis and a rise in 34 hate crimes for moving to 100 percent of the population HS educated.

This is drastically different than what we saw in the simple models, where there were not very steep lines cutting through the data. The multi- regression seems to have caused a steep cut up through the data.

library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
dim_scatter <- plot_ly(crime, 
                       x = ~share_pop_hs, 
                       y = ~gini_index, 
                       z = ~avg_hatecrimes_per_100k_fbi) %>%
  add_markers() %>%
  layout(scene = list(xaxis = list(title = 'HS'),
                     yaxis = list(title = 'Gini'),
                     zaxis = list(title = 'Hate Crimes')))
dim_scatter
## Warning: Ignoring 1 observations